In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 

from sklearn.metrics import mean_absolute_error as mae 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from sklearn.linear_model import LinearRegression 
from xgboost import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor 

import warnings 
warnings.filterwarnings('ignore')

The food server of a restaurant recorded data about the tips given to the waiters for serving the food. The data recorded by the food server is as follows:¶

The columns recorded are: total_bill — total bill in dollars, including taxes; tip — tip given to the waiter, in dollars; sex — gender of the person paying the bill; smoker — whether the person smoked or not; day — day of the week; time — lunch or dinner; size — number of people at the table. Based on this data recorded by the restaurant, our task is to find the factors affecting waiter tips and to train a machine learning model to predict the waiter's tip.

In [2]:
# Load the tips dataset from CSV and preview the first five rows
df = pd.read_csv('tips.csv') 
df.head()
Out[2]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [3]:
# Dataset dimensions: (rows, columns)
df.shape
Out[3]:
(244, 7)
In [4]:
# Column dtypes and non-null counts (all 244 rows are complete)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
In [5]:
# Summary statistics of the numeric columns, transposed for readability
df.describe().T
Out[5]:
count mean std min 25% 50% 75% max
total_bill 244.0 19.785943 8.902412 3.07 13.3475 17.795 24.1275 50.81
tip 244.0 2.998279 1.383638 1.00 2.0000 2.900 3.5625 10.00
size 244.0 2.569672 0.951100 1.00 2.0000 2.000 3.0000 6.00
In [6]:
# Total number of cells (rows x columns = 244 x 7 = 1708) -- NOT the row count;
# use df.shape[0] or len(df) for the number of rows
df.size
Out[6]:
1708
In [7]:
# List of column names
df.columns
Out[7]:
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

EDA¶

In [8]:
# Element-wise null mask (True where a value is missing); purely illustrative --
# the per-column sum in the next cell is the more useful view
df.isnull()
Out[8]:
total_bill tip sex smoker day time size
0 False False False False False False False
1 False False False False False False False
2 False False False False False False False
3 False False False False False False False
4 False False False False False False False
... ... ... ... ... ... ... ...
239 False False False False False False False
240 False False False False False False False
241 False False False False False False False
242 False False False False False False False
243 False False False False False False False

244 rows × 7 columns

In [9]:
# Per-column count of missing values -- this dataset has none
df.isnull().sum()
Out[9]:
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

Data Visualization¶

In [10]:
# Distributions of the two continuous variables.
# sb.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(kde=True) is the supported equivalent (histogram + KDE overlay).
plt.subplots(figsize=(15, 8))

for i, col in enumerate(['total_bill', 'tip']):
    plt.subplot(2, 3, i + 1)
    sb.histplot(df[col], kde=True)
plt.tight_layout()
plt.show()
In [11]:
# seaborn is already imported at the top of the notebook as `sb`;
# re-importing it mid-notebook under a second alias is redundant and
# hides the dependency, so the duplicate import is dropped.
# NOTE(review): 'tip' is a continuous column with ~120 distinct values,
# so using it as `hue` yields an enormous legend -- a categorical column
# such as 'day' or 'size' would be more readable.  Left as-is to
# preserve the original output.
sb.pairplot(data=df, hue='tip')
Out[11]:
<seaborn.axisgrid.PairGrid at 0x2972dc881d0>
In [12]:
# pandas' built-in scatter matrix of the numeric columns
# (same idea as the seaborn pairplot, without hue support)
pd.plotting.scatter_matrix(df)
Out[12]:
array([[<Axes: xlabel='total_bill', ylabel='total_bill'>,
        <Axes: xlabel='tip', ylabel='total_bill'>,
        <Axes: xlabel='size', ylabel='total_bill'>],
       [<Axes: xlabel='total_bill', ylabel='tip'>,
        <Axes: xlabel='tip', ylabel='tip'>,
        <Axes: xlabel='size', ylabel='tip'>],
       [<Axes: xlabel='total_bill', ylabel='size'>,
        <Axes: xlabel='tip', ylabel='size'>,
        <Axes: xlabel='size', ylabel='size'>]], dtype=object)
In [13]:
# Interactive scatter of bill vs. tip: marker size = party size,
# colored by day of week, with an OLS trendline fitted per day.
import plotly.express as px
import plotly.graph_objects as go  # NOTE(review): `go` is never used in this notebook; safe to drop
figure = px.scatter(data_frame = df, x="total_bill",
                    y="tip", size="size", color= "day", trendline="ols")
figure.show()
In [14]:
# Same bill-vs-tip scatter as above, now colored by the payer's sex,
# with one OLS trendline per group.
figure = px.scatter(
    data_frame=df,
    x="total_bill",
    y="tip",
    size="size",
    color="sex",
    trendline="ols",
)
figure.show()
In [15]:
# Bill-vs-tip scatter colored by meal time (lunch vs. dinner),
# with an OLS trendline per group.
figure = px.scatter(
    data_frame=df,
    x="total_bill",
    y="tip",
    size="size",
    color="time",
    trendline="ols",
)
figure.show()
In [16]:
# Donut chart: share of total tip dollars collected on each day of the week.
figure = px.pie(data_frame=df, values='tip', names='day', hole=0.5)
figure.show()
In [17]:
# Donut chart: share of total tip dollars by the payer's sex.
figure = px.pie(data_frame=df, values='tip', names='sex', hole=0.5)
figure.show()
In [18]:
# Donut chart: share of total tip dollars from smokers vs. non-smokers.
figure = px.pie(data_frame=df, values='tip', names='smoker', hole=0.5)
figure.show()
In [19]:
# Donut chart: share of total tip dollars at lunch vs. dinner.
figure = px.pie(data_frame=df, values='tip', names='time', hole=0.5)
figure.show()
In [20]:
# Keep only the numeric columns (total_bill, tip, size) so that the
# correlation matrix below can be computed without the object columns.
numeric_df = df.select_dtypes(include=['number'])
In [21]:
# Preview the numeric-only frame
numeric_df
Out[21]:
total_bill tip size
0 16.99 1.01 2
1 10.34 1.66 3
2 21.01 3.50 3
3 23.68 3.31 2
4 24.59 3.61 4
... ... ... ...
239 29.03 5.92 3
240 27.18 2.00 2
241 22.67 2.00 2
242 17.82 1.75 2
243 18.78 3.00 2

244 rows × 3 columns

In [22]:
# Pairwise Pearson correlations between the numeric columns
correlation_matrix = numeric_df.corr()
In [23]:
# Display the correlation matrix; total_bill is the strongest
# predictor of tip (r = 0.68 per the output below)
correlation_matrix
Out[23]:
total_bill tip size
total_bill 1.000000 0.675734 0.598315
tip 0.675734 1.000000 0.489299
size 0.598315 0.489299 1.000000
In [24]:
# Annotated heatmap of the pairwise correlations among numeric columns
import seaborn as sns            # NOTE(review): redundant -- already imported at the top as `sb`
import matplotlib.pyplot as plt  # NOTE(review): redundant re-import

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='YlGnBu', annot=True)
plt.show()
In [25]:
# Box plots to spot outliers in the two continuous columns.
plt.subplots(figsize=(15, 8))

for idx, column in enumerate(['total_bill', 'tip']):
    plt.subplot(2, 3, idx + 1)
    sb.boxplot(df[column])
plt.tight_layout()
plt.show()
In [26]:
# How many rows would survive an outlier cut (bill < $45 and tip < $7)?
# NOTE(review): this only inspects the filtered shape -- the filter is
# never assigned back to df, so the 6 outlier rows remain in the data
# used for modeling below.  Confirm whether that was intended.
df.shape, df[(df['total_bill']<45) & (df['tip']<7)].shape
Out[26]:
((244, 7), (238, 7))
In [27]:
import seaborn as sb             # NOTE(review): redundant -- already imported at the top
import matplotlib.pyplot as plt  # NOTE(review): redundant re-import

# Count plots for every column from 'sex' through 'size'
# (the discrete/categorical features)
feat = df.loc[:, 'sex':'size'].columns

plt.subplots(figsize=(15, 8))

for i, col in enumerate(feat):
    plt.subplot(2, 3, i + 1)
    sb.countplot(data=df, x=col)
    
plt.tight_layout()
plt.show()
In [28]:
# Visualize how the tip grows with the size of the bill.
fig, ax = plt.subplots()
ax.scatter(df['total_bill'], df['tip'])
ax.set_title('Total Bill v/s Total Tip')
ax.set_xlabel('Total Bill')
ax.set_ylabel('Total Tip')
plt.show()
In [29]:
# Coerce 'size' to numeric (invalid entries become NaN).
# NOTE(review): df.info() above already shows 'size' as int64, so this
# is a no-op here -- kept only as a guard against dirty input files.
df['size'] = pd.to_numeric(df['size'], errors='coerce')
In [30]:
# Verify the dtype after coercion (still int64)
df['size']
Out[30]:
0      2
1      3
2      3
3      2
4      4
      ..
239    3
240    2
241    2
242    2
243    2
Name: size, Length: 244, dtype: int64
In [31]:
# Drop rows where 'size' failed numeric coercion (none exist here,
# so all 244 rows are retained)
df = df.dropna(subset=['size'])
In [32]:
# Full frame after the (no-op) cleaning steps -- still 244 rows
df
Out[32]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [33]:
# Integer-encode the four categorical columns.  Series.map replaces each
# value via the lookup dict; anything outside a mapping would become NaN
# (no such values exist in this dataset).
encodings = {
    "sex": {"Female": 0, "Male": 1},
    "smoker": {"No": 0, "Yes": 1},
    "day": {"Thur": 0, "Fri": 1, "Sat": 2, "Sun": 3},
    "time": {"Lunch": 0, "Dinner": 1},
}
for column, mapping in encodings.items():
    df[column] = df[column].map(mapping)
df.head()
Out[33]:
total_bill tip sex smoker day time size
0 16.99 1.01 0 0 3 1 2
1 10.34 1.66 1 0 3 1 3
2 21.01 3.50 1 0 3 1 3
3 23.68 3.31 1 0 3 1 2
4 24.59 3.61 0 0 3 1 4
In [34]:
# Correlations across all columns now that categoricals are numeric;
# note day/time correlate strongly (0.87 per the output below)
df.corr()
Out[34]:
total_bill tip sex smoker day time size
total_bill 1.000000 0.675734 0.144877 0.085721 0.173693 0.183118 0.598315
tip 0.675734 1.000000 0.088862 0.005929 0.135499 0.121629 0.489299
sex 0.144877 0.088862 1.000000 0.002816 0.230791 0.205231 0.086195
smoker 0.085721 0.005929 0.002816 1.000000 -0.032653 0.054921 -0.133178
day 0.173693 0.135499 0.230791 -0.032653 1.000000 0.874366 0.165350
time 0.183118 0.121629 0.205231 0.054921 0.874366 1.000000 0.103411
size 0.598315 0.489299 0.086195 -0.133178 0.165350 0.103411 1.000000
In [35]:
# Boolean heatmap flagging feature pairs with correlation > 0.7
# (only day/time exceed the threshold besides the diagonal)
plt.figure(figsize=(7,7)) 
sb.heatmap(df.corr() > 0.7, annot = True, cmap='YlGnBu') 
plt.show()
In [36]:
# Separate predictors from the target and hold out 20% of rows for
# validation.  random_state is pinned so the split is reproducible.
target = df['tip']
features = df.drop('tip', axis=1)

X_train, X_val, Y_train, Y_val = train_test_split(
    features, target, test_size=0.2, random_state=22
)
X_train.shape, X_val.shape
Out[36]:
((195, 6), (49, 6))
In [37]:
# Inspect the (still unscaled) training features
X_train
Out[37]:
total_bill sex smoker day time size
40 16.04 1 0 2 1 3
176 17.89 1 1 3 1 2
18 16.97 0 0 3 1 3
229 22.12 0 1 2 1 2
50 12.54 1 0 3 1 2
... ... ... ... ... ... ...
100 11.35 0 1 1 1 2
192 28.44 1 1 0 0 2
44 30.40 1 0 3 1 4
132 11.17 0 0 0 0 2
117 10.65 0 0 0 0 2

195 rows × 6 columns

In [38]:
# Inspect the (still unscaled) validation features
X_val
Out[38]:
total_bill sex smoker day time size
200 18.71 1 1 0 0 3
112 38.07 1 0 3 1 3
128 11.38 0 0 0 0 2
179 34.63 1 1 3 1 2
49 18.04 1 0 3 1 2
15 21.58 1 0 3 1 2
184 40.55 1 1 3 1 2
213 13.27 0 1 2 1 2
32 15.06 0 0 2 1 2
55 19.49 1 0 3 1 2
36 16.31 1 0 2 1 3
174 16.82 1 1 3 1 2
171 15.81 1 1 2 1 2
207 38.73 1 1 2 1 4
28 21.70 1 0 2 1 2
154 19.77 1 0 3 1 4
85 34.83 0 0 0 0 4
94 22.75 0 0 1 1 2
12 15.42 1 0 3 1 2
190 15.69 1 1 3 1 2
183 23.17 1 1 3 1 4
26 13.37 1 0 2 1 2
41 17.46 1 0 3 1 2
142 41.19 1 0 0 0 5
46 22.23 1 0 3 1 2
78 22.76 1 0 0 0 2
71 17.07 0 0 2 1 3
56 38.01 1 1 2 1 4
140 17.47 0 0 0 0 2
224 13.42 1 1 1 0 2
2 21.01 1 0 3 1 3
20 17.92 1 0 2 1 2
75 10.51 1 0 2 1 2
165 24.52 1 0 3 1 3
87 18.28 1 0 0 0 2
148 9.78 1 0 0 0 2
30 9.55 1 0 2 1 2
110 14.00 1 0 2 1 2
218 7.74 1 1 2 1 2
238 35.83 0 0 2 1 3
186 20.90 0 1 3 1 3
90 28.97 1 1 1 1 2
61 13.81 1 1 2 1 2
114 25.71 0 0 3 1 3
73 25.28 0 1 2 1 2
153 24.55 1 0 3 1 4
178 9.60 0 1 3 1 2
95 40.17 1 1 1 1 4
189 23.10 1 1 3 1 3
In [39]:
# Training-split target values (tip amounts in dollars)
Y_train
Out[39]:
40     2.24
176    2.00
18     3.50
229    2.88
50     2.50
       ... 
100    2.50
192    2.56
44     5.60
132    1.50
117    1.50
Name: tip, Length: 195, dtype: float64
In [40]:
# Validation-split target values (tip amounts in dollars)
Y_val
Out[40]:
200    4.00
112    4.00
128    2.00
179    3.55
49     3.00
15     3.92
184    3.00
213    2.50
32     3.00
55     3.51
36     2.00
174    4.00
171    3.16
207    3.00
28     4.30
154    2.00
85     5.17
94     3.25
12     1.57
190    1.50
183    6.50
26     2.00
41     2.54
142    5.00
46     5.00
78     3.00
71     3.00
56     3.00
140    3.50
224    1.58
2      3.50
20     4.08
75     1.25
165    3.48
87     4.00
148    1.73
30     1.45
110    3.00
218    1.44
238    4.67
186    3.50
90     3.00
61     2.00
114    4.00
73     5.00
153    2.00
178    4.00
95     4.73
189    4.00
Name: tip, dtype: float64
In [41]:
# Standardize features: fit scaling parameters on the training split only,
# then apply the same transform to validation (avoids data leakage).
# NOTE(review): this overwrites the DataFrames with ndarrays in place, so
# re-running the cell would re-fit on already-scaled data -- not idempotent.
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train) 
X_val = scaler.transform(X_val)
In [42]:
# Training features after standardization (now a NumPy array)
X_train
Out[42]:
array([[-0.37642936,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
         0.43387166],
       [-0.16555807,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
        -0.58546537],
       [-0.2704238 , -1.26491106, -0.78202957,  1.15477905,  0.65865281,
         0.43387166],
       ...,
       [ 1.26038778,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
         1.45320869],
       [-0.93153378, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537],
       [-0.99080571, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537]])
In [43]:
# Validation features after standardization (now a NumPy array)
X_val
Out[43]:
array([[-0.0720908 ,  0.79056942,  1.27872403, -1.42363982, -1.51825055,
         0.43387166],
       [ 2.13464874,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
         0.43387166],
       [-0.90759704, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537],
       [ 1.74254213,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
        -0.58546537],
       [-0.1484604 ,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
        -0.58546537],
       [ 0.25504466,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
        -0.58546537],
       [ 2.41733025,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
        -0.58546537],
       [-0.69216637, -1.26491106,  1.27872403,  0.29530609,  0.65865281,
        -0.58546537],
       [-0.48813415, -1.26491106, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [ 0.01681709,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
        -0.58546537],
       [-0.34565355,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
         0.43387166],
       [-0.28752147,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
        -0.58546537],
       [-0.40264579,  0.79056942,  1.27872403,  0.29530609,  0.65865281,
        -0.58546537],
       [ 2.2098785 ,  0.79056942,  1.27872403,  0.29530609,  0.65865281,
         1.45320869],
       [ 0.2687228 ,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [ 0.04873275,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
         1.45320869],
       [ 1.76533902, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
         1.45320869],
       [ 0.3884065 , -1.26491106, -0.78202957, -0.56416686,  0.65865281,
        -0.58546537],
       [-0.44709974,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
        -0.58546537],
       [-0.41632393,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
        -0.58546537],
       [ 0.43627998,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
         1.45320869],
       [-0.68076792,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [-0.2145714 ,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
        -0.58546537],
       [ 2.49028032,  0.79056942, -0.78202957, -1.42363982, -1.51825055,
         2.47254572],
       [ 0.32913457,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
        -0.58546537],
       [ 0.38954635,  0.79056942, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537],
       [-0.25902535, -1.26491106, -0.78202957,  0.29530609,  0.65865281,
         0.43387166],
       [ 2.12780967,  0.79056942,  1.27872403,  0.29530609,  0.65865281,
         1.45320869],
       [-0.21343156, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537],
       [-0.6750687 ,  0.79056942,  1.27872403, -0.56416686, -1.51825055,
        -0.58546537],
       [ 0.1900735 ,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
         0.43387166],
       [-0.16213854,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [-1.00676354,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [ 0.59015903,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
         0.43387166],
       [-0.12110413,  0.79056942, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537],
       [-1.08997221,  0.79056942, -0.78202957, -1.42363982, -1.51825055,
        -0.58546537],
       [-1.11618864,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [-0.6089577 ,  0.79056942, -0.78202957,  0.29530609,  0.65865281,
        -0.58546537],
       [-1.32250055,  0.79056942,  1.27872403,  0.29530609,  0.65865281,
        -0.58546537],
       [ 1.8793235 , -1.26491106, -0.78202957,  0.29530609,  0.65865281,
         0.43387166],
       [ 0.17753521, -1.26491106,  1.27872403,  1.15477905,  0.65865281,
         0.43387166],
       [ 1.09738997,  0.79056942,  1.27872403, -0.56416686,  0.65865281,
        -0.58546537],
       [-0.63061475,  0.79056942,  1.27872403,  0.29530609,  0.65865281,
        -0.58546537],
       [ 0.72580056, -1.26491106, -0.78202957,  1.15477905,  0.65865281,
         0.43387166],
       [ 0.67678724, -1.26491106,  1.27872403,  0.29530609,  0.65865281,
        -0.58546537],
       [ 0.59357857,  0.79056942, -0.78202957,  1.15477905,  0.65865281,
         1.45320869],
       [-1.11048942, -1.26491106,  1.27872403,  1.15477905,  0.65865281,
        -0.58546537],
       [ 2.37401615,  0.79056942,  1.27872403, -0.56416686,  0.65865281,
         1.45320869],
       [ 0.42830107,  0.79056942,  1.27872403,  1.15477905,  0.65865281,
         0.43387166]])
In [44]:
# Fit four regressors and compare mean absolute error (MAE) on the
# training and validation splits.  MAE is an error metric -- LOWER is
# better -- so the original "Accuracy" labels were misleading and have
# been corrected.  Iterating over the list directly also removes the
# brittle hard-coded range(4).
models = [LinearRegression(), XGBRegressor(), RandomForestRegressor(), AdaBoostRegressor()]
for model in models:
    model.fit(X_train, Y_train)
    print(f'{model} : ')
    pred_train = model.predict(X_train)
    print('Training MAE : ', mae(Y_train, pred_train))
    pred_val = model.predict(X_val)
    print('Validation MAE : ', mae(Y_val, pred_val))
    print()
LinearRegression() : 
Training Accuracy :  0.7119950102059002
Validation Accuracy :  0.8394837715187264

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) : 
Training Accuracy :  0.02740513884715545
Validation Accuracy :  0.8656760615718607

RandomForestRegressor() : 
Training Accuracy :  0.2995215384615385
Validation Accuracy :  0.7950938775510201

AdaBoostRegressor() : 
Training Accuracy :  0.6440256340358487
Validation Accuracy :  0.8072559727229153

In [ ]:
 
In [ ]:
 
In [ ]: